def get_paper(link):
    """Return the third "/"-separated piece of *link*.

    For a link shaped like ``scheme://host/path`` that piece is the host
    part, which the rest of the script uses to tell papers apart.

    Raises:
        IndexError: if *link* contains fewer than two "/" characters.
    """
    # Only the first three pieces matter; the rest of the link stays unsplit.
    pieces = link.split("/", 3)
    return pieces[2]

import pickle
import datetime

# Output and input files (resolved against the current working directory).
ARTICLES_CSV = "NER - Articles Dataset - 2018 Final.csv"
COMMUNITIES_PICKLE = "communities (2).pickle"

# Column names of the output file, in the order the row values are written.
CSV_COLUMNS = (
    "story", "article_link", "story size", "story size paper",
    "num_previous", "num_previous_paper", "num_follow_up",
    "num_follow_up_paper", "time_after_initial", "time_after_initial_paper",
)


def hours_between(later, earlier):
    """Return the time elapsed from *earlier* to *later*, in hours.

    The difference ``later - earlier`` must expose ``days`` and ``seconds``
    (as ``datetime.timedelta`` does). Whole days and the seconds remainder
    are converted to hours; microseconds are not taken into account.
    """
    delta = later - earlier
    return delta.days * 24 + (delta.seconds / 60) / 60


def article_stats(paper, published, records):
    """Locate one article within its story cluster.

    Args:
        paper: paper identifier of the article being described.
        published: publication time of that article.
        records: ``(paper, time)`` pairs for every article of the cluster.
            The article itself may be included; it is counted in
            ``num_paper`` but never as previous or follow-up, because the
            time comparisons are strict.

    Returns:
        A tuple ``(num_paper, num_previous, num_previous_paper,
        num_follow_up, num_follow_up_paper, start, paper_start)`` where
        ``start`` is the earliest time among *published* and all records,
        and ``paper_start`` is the earliest time among *published* and the
        records of the same paper.
    """
    num_paper = 0
    num_previous = 0
    num_previous_paper = 0
    num_follow_up = 0
    num_follow_up_paper = 0
    start = published
    paper_start = published

    for other_paper, other_time in records:
        same_paper = other_paper == paper

        if same_paper:
            num_paper += 1
            # Track the same-paper minimum against itself. Comparing with
            # the cluster-wide minimum would miss earlier same-paper
            # articles once another paper has published even earlier.
            if other_time < paper_start:
                paper_start = other_time

        if other_time < published:
            num_previous += 1
            if same_paper:
                num_previous_paper += 1
        elif other_time > published:
            num_follow_up += 1
            if same_paper:
                num_follow_up_paper += 1

        if other_time < start:
            start = other_time

    return (num_paper, num_previous, num_previous_paper,
            num_follow_up, num_follow_up_paper, start, paper_start)


def main():
    """Write one ';'-separated row of cluster statistics per article.

    Reads the story clusters from COMMUNITIES_PICKLE, a mapping whose
    values map article links to records holding a "time" entry, and writes
    the result to ARTICLES_CSV. Rows are numbered by the position of the
    cluster in iteration order.
    """
    # NOTE: pickle.load can execute arbitrary code while loading; only run
    # this on a pickle file that comes from a trusted source.
    # The input is loaded before the output is opened so that a missing or
    # broken input does not truncate an existing results file.
    with open(COMMUNITIES_PICKLE, "rb") as pickle_file:
        communities = pickle.load(pickle_file)

    with open(ARTICLES_CSV, "w") as art_out:
        art_out.write(";".join(CSV_COLUMNS))
        art_out.write("\n")

        print("Processing")

        # Each item in this dictionary is a story cluster.
        for story_id, cluster in enumerate(communities):
            articles = communities[cluster]
            num_total = len(articles)

            # Resolve paper and time once per article instead of once per
            # pair of articles.
            entries = [(link, get_paper(link), articles[link]["time"])
                       for link in articles]
            records = [(paper, published) for _, paper, published in entries]

            for link, paper, published in entries:
                (num_paper, num_previous, num_previous_paper,
                 num_follow_up, num_follow_up_paper,
                 start, paper_start) = article_stats(paper, published, records)

                row = (
                    story_id, link, num_total, num_paper,
                    num_previous, num_previous_paper,
                    num_follow_up, num_follow_up_paper,
                    hours_between(published, start),
                    hours_between(published, paper_start),
                )
                art_out.write(";".join(str(value) for value in row))
                art_out.write("\n")

    print("Done")


if __name__ == "__main__":
    main()

            
            

    

